/******************************************************************************
 * Copyright (c) 2010 Basis Technology Corp.
 *
 * Basis Technology Corp. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.basistech.readability;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Java version of the arclab readability javascript program. This uses jsoup to handle the DOM tree and
 * provide us with the sorts of operations that the javascript code loves. Make one of these objects for each
 * page. Provide it with an object to fetch more next pages to support that stuff.
 * <p>
 * Instances keep per-page state in fields while processing, so a single instance must not be shared between
 * threads.
 */
public class Readability {
    private static final Logger LOG = LoggerFactory.getLogger(Readability.class);

    /** Tag names whose presence inside a div prevents that div from being turned into a paragraph. */
    private static final Set<String> DIV_TO_P_ELEMENTS;
    static {
        DIV_TO_P_ELEMENTS = new HashSet<String>();
        DIV_TO_P_ELEMENTS.add("a");
        DIV_TO_P_ELEMENTS.add("blockquote");
        DIV_TO_P_ELEMENTS.add("dl");
        DIV_TO_P_ELEMENTS.add("div");
        DIV_TO_P_ELEMENTS.add("img");
        DIV_TO_P_ELEMENTS.add("ol");
        DIV_TO_P_ELEMENTS.add("p");
        DIV_TO_P_ELEMENTS.add("pre");
        DIV_TO_P_ELEMENTS.add("table");
        DIV_TO_P_ELEMENTS.add("ul");
    }

    private Document document;
    private Element body;
    private PageReader pageReader;
    private String givenUrl;
    private Set<String> parsedPages;
    private boolean impossible;
    private String title;
    private boolean stripUnlikelyCandidates = true;
    private boolean classWeight = true;
    private boolean cleanConditionally = true;
    private String nextPageLink;
    private String articleText;
    private boolean readAllPages;
    private boolean notFirstPage;
    private NekoJsoupParser nekoParser = new NekoJsoupParser();
    // for some testing and debugging purposes, obtain string reps of the XML we
    // got from parsing.
    private List<String> xmlImages;

    public Readability() {
        parsedPages = new HashSet<String>();
    }

    /**
     * Fetch the page at {@code url} through the configured {@link PageReader}, parse it with jsoup and
     * extract its title and article text. When {@link #setReadAllPages(boolean)} is enabled and a next-page
     * link is found, the following pages are processed recursively and their text is appended.
     * <p>
     * The page reader hands back a String, so charset detection and conversion are the reader's problem.
     *
     * @param url the initial url
     * @throws PageReadException if the parsed document has no body element (and, presumably, if the page
     *             reader fails)
     */
    public void processDocument(String url) throws PageReadException {
        // TODO: reset the results.
        impossible = false;
        givenUrl = url;
        nextPageLink = null;
        if (!notFirstPage) {
            xmlImages = new ArrayList<String>();
            title = null;
        }

        String content = pageReader.readPage(url);

        document = Jsoup.parse(content);

        if (document.getElementsByTag("body").size() == 0) {
            LOG.error("no body to parse " + url);
            impossible = true;
            throw new PageReadException("no body to parse");
        }

        init(); // this needs another name, it does all the work.
        if (readAllPages && nextPageLink != null) {
            try {
                String textSoFar = articleText;
                notFirstPage = true;
                processDocument(nextPageLink);
                if (articleText == null) {
                    // The follow-up page produced nothing usable; keep what we already have.
                    articleText = textSoFar;
                } else if (textSoFar != null) {
                    articleText = textSoFar + articleText;
                }
            } finally {
                notFirstPage = false;
            }
        }
    }

    /**
     * Remove every script element except those whose {@code src} mentions "readability" or "typekit".
     */
    private void removeScripts() {
        Elements scripts = document.getElementsByTag("script");
        for (int i = scripts.size() - 1; i >= 0; i--) {
            Element e = scripts.get(i);
            String src = e.attr("src");
            if ("".equals(src) || (src.indexOf("readability") == -1 && src.indexOf("typekit") == -1)) {
                e.remove();
            }
        }
    }

    /**
     * Some pages have a {@code <p></p>} combination to generate a space, but readability seems to ignore it.
     * Convert them to a single {@code <p>} and let jsoup re-parse the body.
     */
    private void handlePP() {
        String inner = document.body().html();
        // Strings are immutable: the result has to be captured, otherwise this method is a no-op.
        inner = inner.replaceAll("<p></p>", "<p>");
        document.body().html(inner);
    }

    /**
     * Turn runs of br elements into paragraph breaks. For each br that directly follows another br, the
     * nearest enclosing p (or, failing that, the br's parent wrapped in a new p) has its inner HTML rewritten
     * using {@code Patterns.REPLACE_BRS}.
     */
    private void handleDoubleBr() {
        Elements doubleBrs = document.select("br + br");
        for (Element br : doubleBrs) {
            // we hope that there's a 'p' up there....
            Elements parents = br.parents();
            Element parent = null;
            for (Element aparent : parents) {
                if (aparent.tag().getName().equals("p")) {
                    parent = aparent;
                    break;
                }
            }
            if (parent == null) {
                parent = br.parent();
                if (parent == null) {
                    // This br was detached when an earlier iteration rewrote its parent's HTML.
                    continue;
                }
                parent.wrap("<p></p>");
            }
            // now it's safe to make the change.
            String inner = parent.html();
            inner = Patterns.REPLACE_BRS.matcher(inner).replaceAll("</p><p>");
            parent.html(inner);
        }
    }

    /**
     * Prepare the parsed document for scoring: tag the body, bail out on framesets, drop stylesheets and
     * normalize paragraph-ish markup.
     */
    private void prepDocument() {
        /**
         * In some cases a body element can't be found (if the HTML is totally hosed for example) so we create
         * a new body node and append it to the document.
         */
        if (body == null) {
            body = document.appendElement("body");
        }

        body.attr("id", "readabilityBody");

        Elements frames = document.getElementsByTag("frame");
        if (frames.size() > 0) {
            LOG.error("Frames. Can't deal. Write code later to look at URLs and fetch");
            impossible = true;
            return;
        }

        Elements stylesheets = document.getElementsByTag("style");
        stylesheets.remove();
        stylesheets = document.select("link[rel='stylesheet']");
        stylesheets.remove();

        /* Turn all double br's into p's */
        /*
         * Note, this is pretty costly as far as processing goes. Maybe optimize later.
         */
        handlePP();
        handleDoubleBr();
        fontsToSpans();
    }

    /** Replace every font element with a span holding the same children. */
    private void fontsToSpans() {
        Elements allFonts = document.getElementsByTag("font");
        for (Element fontElement : allFonts) {
            changeElementTag(fontElement, "span");
        }
    }

    /** Strip a single trailing slash from {@code url}, if present. */
    private String normalizeTrailingSlash(String url) {
        return url.replaceAll("/$", "");
    }

    /**
     * Run the whole extraction pipeline on the current {@link #document}: clean up scripts, locate the body,
     * look for a next-page link, pick the title, and grab the article text.
     */
    private void init() {
        removeScripts();
        convertNoscriptToDiv();
        // there should never be more than one ...
        Elements bodies = document.getElementsByTag("body");
        if (bodies.size() > 1) {
            LOG.warn("More than one <body/>");
        }
        body = bodies.get(0);
        /*
         * Make sure this document is added to the list of parsed pages first, so we don't double up on the
         * first page
         */
        parsedPages.add(normalizeTrailingSlash(givenUrl));
        // respect the readAllPages flag, very important if a stringPage
        if (readAllPages) {
            nextPageLink = findNextPageLink(body);
        }
        if (!notFirstPage) {
            title = getArticleTitle();
        }
        prepDocument();

        Element articleContent = grabArticle(null);
        if (articleContent == null) {
            // this happens when the content of the page is very short.
            // we don't believe in super-short next pages, so those contribute nothing.
            articleText = notFirstPage ? null : body.text();
        } else {
            xmlImages.add(articleContent.outerHtml());
            articleText = getDisplayText(articleContent);
        }
    }

    /** Replace every noscript element with a div holding the same children. */
    private void convertNoscriptToDiv() {
        Elements noscript = document.getElementsByTag("noscript");
        for (Element e : noscript) {
            changeElementTag(e, "div");
        }
    }

    /** Store {@code score} on {@code node} as its content score attribute. */
    private void setContentScore(Element node, double score) {
        node.attr("data-readability.contentScore", Double.toString(score));
    }

    /** @return true if {@code node} carries a content score attribute. */
    private boolean isElementScored(Element node) {
        return node.hasAttr("data-readability.contentScore");
    }

    /** Add {@code score} to the content score stored on {@code node}. */
    private void incrementContentScore(Element node, double score) {
        node.attr("data-readability.contentScore", Double.toString(getContentScore(node) + score));
    }

    /** @return the content score stored on {@code node}, or 0 when none has been stored. */
    private double getContentScore(Element node) {
        String scoreString = node.attr("data-readability.contentScore");
        if ("".equals(scoreString)) {
            return 0;
        } else {
            return Double.parseDouble(scoreString);
        }
    }

    /**
     * Mark {@code node} as a scoring candidate and give it an initial score based on its tag name and its
     * class/id weight.
     */
    private void initializeNode(Element node) {
        // CHECKSTYLE:OFF
        node.attr("readability", "true");
        String tagName = node.tagName();
        if ("div".equals(tagName)) {
            incrementContentScore(node, 5);
        } else if ("pre".equals(tagName) || "td".equals(tagName) || "blockquote".equals(tagName)) {
            incrementContentScore(node, 3);
        } else if ("address".equals(tagName) || "ol".equals(tagName) || "ul".equals(tagName)
                   || "dl".equals(tagName) || "dd".equals(tagName) || "dt".equals(tagName)
                   || "li".equals(tagName) || "form".equals(tagName)) {
            incrementContentScore(node, -3);
        } else if (tagName.matches("h[1-6]") || "th".equals(tagName)) {
            incrementContentScore(node, -5);
        }
        incrementContentScore(node, getClassWeight(node));
        // CHECKSTYLE:ON
    }

    /**
     * Get an element's class/id weight. Uses regular expressions to tell if this element looks good or bad.
     *
     * @param e the element to inspect
     * @return the sum of -25/+25 adjustments for the class name and the id; 0 when class weighting is off
     **/
    private double getClassWeight(Element e) {
        if (!classWeight) {
            return 0;
        }

        int weight = 0;

        /* Look for a special classname */
        String className = e.className();
        if (!"".equals(className)) {
            if (Patterns.exists(Patterns.NEGATIVE, className)) {
                weight -= 25;
            }
            if (Patterns.exists(Patterns.POSITIVE, className)) {
                weight += 25;
            }
        }

        /* Look for a special ID */
        String id = e.id();
        if (!"".equals(id)) {
            if (Patterns.exists(Patterns.NEGATIVE, id)) {
                weight -= 25;
            }
            if (Patterns.exists(Patterns.POSITIVE, id)) {
                weight += 25;
            }
        }
        return weight;
    }

    /**
     * Replace {@code e} in the tree with a fresh element named {@code newTag} that takes over all of
     * {@code e}'s child nodes. Attributes of {@code e} are not carried over.
     *
     * @return the replacement element
     */
    private Element changeElementTag(Element e, String newTag) {
        Element newElement = document.createElement(newTag);
        /* JSoup gives us the live child list, so we need to make a copy. */
        List<Node> copyOfChildNodeList = new ArrayList<Node>();
        copyOfChildNodeList.addAll(e.childNodes());
        for (Node n : copyOfChildNodeList) {
            n.remove();
            newElement.appendChild(n);
        }
        e.replaceWith(newElement);
        return newElement;
    }

    /** Count how many times the literal character {@code c} occurs in {@code text}. */
    private static int countOccurrences(String text, char c) {
        int count = 0;
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) == c) {
                count++;
            }
        }
        return count;
    }

    // CHECKSTYLE:OFF
    /**
     * Find the element subtree that most looks like the article and return a cleaned-up copy container.
     * If the result is shorter than 250 characters the method retries with progressively fewer heuristics
     * (unlikely-candidate stripping, class weighting, conditional cleaning) switched off.
     *
     * @param pageElement the element to search, or null to search the document body
     * @return a div holding the article content, or null if nothing meaningful was found with every
     *         heuristic disabled
     */
    private Element grabArticle(Element pageElement) {
        boolean isPaging = pageElement != null;

        if (pageElement == null) {
            pageElement = body;
        }

        String pageCacheHtml = pageElement.html();
        Elements allElements = pageElement.getAllElements();
        /*
         * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its
         * children would remove themselves. To get the same effect, we make a linked list and we remove
         * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be
         * doing something nearly as awful.
         */
        LinkedList<Element> allElementsList = new LinkedList<Element>();
        allElementsList.addAll(allElements);

        /**
         * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc),
         * and turn divs into P tags where they have been used inappropriately (as in, where they contain no
         * other block level elements.) Note: Assignment from index for performance. See
         * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
         * traversal?
         **/
        List<Element> nodesToScore = new ArrayList<Element>();
        ListIterator<Element> elIterator = allElementsList.listIterator();
        Set<Element> goodAsDead = new HashSet<Element>();
        while (elIterator.hasNext()) {
            Element node = elIterator.next();
            if (goodAsDead.contains(node)) {
                continue;
            }

            /* Remove unlikely candidates */
            if (stripUnlikelyCandidates) {
                String unlikelyMatchString = node.className() + node.id();
                if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString)
                    && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString)
                    && !"body".equals(node.tagName())) {
                    LOG.debug("Removing unlikely candidate - " + unlikelyMatchString);
                    // NOTE(review): the node is only dropped from the work list, it stays in the DOM.
                    // The javascript original detaches it from its parent - confirm this is intended.
                    List<Element> toRemoveAndBelow = node.getAllElements();
                    elIterator.remove();
                    /*
                     * adding 'node' to that set is harmless and reduces the code complexity here.
                     */
                    goodAsDead.addAll(toRemoveAndBelow);
                    continue;
                }
            }

            if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) {
                nodesToScore.add(node);
            }

            /*
             * Turn all divs that don't have children block level elements into p's
             */
            if ("div".equals(node.tagName())) {
                boolean hasBlock = false;
                for (Element divChild : node.getAllElements()) {
                    if (divChild != node) {
                        if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) {
                            hasBlock = true;
                            break;
                        }
                    }
                }
                if (!hasBlock) {
                    Element newElement = changeElementTag(node, "p");
                    nodesToScore.remove(node);
                    nodesToScore.add(newElement);
                } else {
                    /* EXPERIMENTAL *//*
                                       * grab just child text and wrap each chunk in a p
                                       */
                    int limit = node.childNodes().size();
                    for (int i = 0; i < limit; i++) {
                        Node childNode = node.childNodes().get(i);
                        if (childNode instanceof TextNode) {
                            Element p = document.createElement("p");
                            p.attr("basisInline", "true");
                            // Set as text, not html: the node's text is already unescaped and must not be
                            // re-parsed as markup.
                            p.text(((TextNode)childNode).text());
                            childNode.replaceWith(p);
                        }
                    }
                }
            }
        }

        /**
         * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add
         * their score to their parent node. A score is determined by things like number of commas, class
         * names, etc. Maybe eventually link density.
         **/
        List<Element> candidates = new ArrayList<Element>();
        for (Element nodeToScore : nodesToScore) {
            Element parentNode = nodeToScore.parent();
            if (null == parentNode) { // might be an orphan whose parent was
                // dropped previously.
                continue;
            }
            Element grandParentNode = parentNode.parent();
            if (grandParentNode == null) {
                continue; // ditto
            }
            String innerText = nodeToScore.text();

            /*
             * If this paragraph is less than 25 characters, don't even count it.
             */
            if (innerText.length() < 25) {
                continue;
            }

            /* Initialize readability data for the parent. */
            if ("".equals(parentNode.attr("readability"))) {
                initializeNode(parentNode);
                candidates.add(parentNode);
            }

            /* Initialize readability data for the grandparent. */
            /*
             * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that
             * we're operating in an orphan.
             */
            if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) {
                initializeNode(grandParentNode);
                candidates.add(grandParentNode);
            }

            double contentScore = 0;

            /* Add a point for the paragraph itself as a base. */
            contentScore++;

            /* Add points for any commas within this paragraph (number of comma-separated chunks). */
            contentScore += countOccurrences(innerText, ',') + 1;

            /*
             * For every 100 characters in this paragraph, add another point. Up to 3 points.
             */
            contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0);

            /* Add the score to the parent. The grandparent gets half. */
            incrementContentScore(parentNode, contentScore);
            incrementContentScore(grandParentNode, contentScore / 2.0);
        }

        /**
         * After we've calculated scores, loop through all of the possible candidate nodes we found and find
         * the one with the highest score.
         **/
        Element topCandidate = null;
        for (Element candidate : candidates) {
            /**
             * Scale the final candidates score based on link density. Good content should have a relatively
             * small link density (5% or less) and be mostly unaffected by this operation.
             **/
            double score = getContentScore(candidate);
            double newScore = score * (1.0 - getLinkDensity(candidate));
            setContentScore(candidate, newScore);
            LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":"
                      + candidate.id() + ") with score " + newScore);

            if (null == topCandidate || newScore > getContentScore(topCandidate)) {
                topCandidate = candidate;
            }
        }

        /**
         * If we still have no top candidate, just use the body as a last resort. We also have to copy the
         * body node so it is something we can modify.
         **/
        if (topCandidate == null || topCandidate == body) {
            topCandidate = document.createElement("div");
            // not efficient but not likely.
            topCandidate.html(pageElement.html());
            pageElement.html("");
            pageElement.appendChild(topCandidate);
            initializeNode(topCandidate);
        }

        /**
         * Now that we have the top candidate, look through its siblings for content that might also be
         * related. Things like preambles, content split by ads that we removed, etc.
         **/
        Element articleContent = document.createElement("div");
        if (isPaging) {
            articleContent.attr("id", "readability-content");
        }
        double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2);
        List<Element> siblingNodes = topCandidate.parent().children();

        for (Element siblingNode : siblingNodes) {
            boolean scored = isElementScored(siblingNode);

            boolean append = false;

            LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className()
                      + ":" + siblingNode.id() + ")");
            if (scored) {
                LOG.debug("Sibling has score " + getContentScore(siblingNode));
            } else {
                LOG.debug("Sibling has score unknown");
            }

            if (siblingNode == topCandidate) {
                append = true;
            }

            double contentBonus = 0;
            /*
             * Give a bonus if sibling nodes and top candidates have the exact same classname
             */
            if (siblingNode.className().equals(topCandidate.className())
                && !"".equals(topCandidate.className())) {
                contentBonus += getContentScore(topCandidate) * 0.2;
            }

            if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) {
                append = true;
            }

            if ("p".equals(siblingNode.tagName())) {
                double linkDensity = getLinkDensity(siblingNode);
                String nodeContent = siblingNode.text();
                int nodeLength = nodeContent.length();

                if (nodeLength > 80 && linkDensity < 0.25) {
                    append = true;
                } else if (nodeLength < 80 && linkDensity == 0
                           && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) {
                    append = true;
                }
            }

            if (append) {
                LOG.debug("Appending node: [" + siblingNode.getClass() + "]");

                Element nodeToAppend = null;
                if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) {
                    /*
                     * We have a node that isn't a common block level element, like a form or td tag. Turn it
                     * into a div so it doesn't get filtered out later by accident.
                     */

                    LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div.");
                    nodeToAppend = changeElementTag(siblingNode, "div");
                } else {
                    nodeToAppend = siblingNode;
                }

                /*
                 * To ensure a node does not interfere with readability styles, remove its classnames
                 */
                nodeToAppend.removeAttr("class");

                /*
                 * Append sibling and subtract from our list because it removes the node when you append to
                 * another node
                 */
                articleContent.appendChild(nodeToAppend);
            }
        }

        document.body().empty();
        document.body().appendChild(articleContent);

        /**
         * So we have all of the content that we need. Now we clean it up for presentation.
         **/
        prepArticle(articleContent);

        /**
         * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If
         * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
         * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding
         * the -right- content.
         **/
        if (articleContent.text().length() < 250) {
            pageElement.html(pageCacheHtml);
            if (stripUnlikelyCandidates) {
                try {
                    stripUnlikelyCandidates = false;
                    return grabArticle(pageElement);
                } finally {
                    stripUnlikelyCandidates = true;
                }
            } else if (classWeight) {
                try {
                    classWeight = false;
                    return grabArticle(pageElement);
                } finally {
                    classWeight = true;
                }
            } else if (cleanConditionally) {
                try {
                    cleanConditionally = false;
                    return grabArticle(pageElement);
                } finally {
                    cleanConditionally = true;
                }
            } else {
                return null;
            }
        }

        return articleContent;
    }

    /**
     * Render the current document as display text via {@link HtmlPage}. Note that the whole
     * {@link #document} is processed; the {@code e} argument is not consulted.
     */
    private String getDisplayText(Element e) {
        HtmlPage htmlPage = new HtmlPage();
        htmlPage.process(document);
        String thisText = htmlPage.getPcData();
        LOG.debug("Text: " + thisText);
        return thisText;
    }

    /**
     * Clean an element of all tags of type "tag" if they look fishy. "Fishy" is an algorithm based on content
     * length, classnames, link density, number of images & embeds, etc. Does nothing when conditional
     * cleaning is switched off.
     *
     * @param e the element whose descendants are examined
     * @param tag the tag name of the descendants to consider for removal
     **/
    private void cleanConditionally(Element e, String tag) {

        if (!cleanConditionally) {
            return;
        }

        Elements tagsList = e.getElementsByTag(tag);
        int curTagsLength = tagsList.size();

        /**
         * Gather counts for other typical elements embedded within. Traverse backwards so we can remove nodes
         * at the same time without affecting the traversal. TODO: Consider taking into account original
         * contentScore here.
         **/
        for (int i = curTagsLength - 1; i >= 0; i--) {
            Element ee = tagsList.get(i);
            if (ee.ownerDocument() == null) {
                continue; // it a child of something we've already killed, so it
                // has no document.
            }
            double weight = getClassWeight(ee);
            double contentScore = getContentScore(ee);

            LOG.debug("Cleaning Conditionally [" + ee.getClass() + "] (" + ee.className() + ":" + ee.id()
                      + ")" + contentScore);

            if (weight + contentScore < 0) {
                LOG.debug("Negative content score");
                ee.remove();
            } else if (getCharCount(ee, ',') < 10) {
                /**
                 * If there are not very many commas, and the number of non-paragraph elements is more than
                 * paragraphs or other ominous signs, remove the element.
                 **/
                int p = ee.getElementsByTag("p").size();
                int img = ee.getElementsByTag("img").size();
                int li = ee.getElementsByTag("li").size() - 100;
                int input = ee.getElementsByTag("input").size();

                Elements embeds = ee.getElementsByTag("embed");
                int embedCount = embeds.size();
                // removed code that pays specific attention to youtube.
                double linkDensity = getLinkDensity(ee);
                int contentLength = ee.text().length();
                boolean toRemove = false;

                if (img > p) {
                    toRemove = true;
                } else if (li > p && !"ul".equals(tag) && !"ol".equals(tag)) {
                    toRemove = true;
                } else if (input > Math.floor(p / 3)) {
                    toRemove = true;
                } else if (contentLength < 25 && (img == 0 || img > 2)) {
                    toRemove = true;
                } else if (weight < 25 && linkDensity > 0.2) {
                    toRemove = true;
                } else if (weight >= 25 && linkDensity > 0.5) {
                    toRemove = true;
                } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
                    toRemove = true;
                }

                if (toRemove) {
                    LOG.debug("failed keep tests.");
                    ee.remove();
                }
            }
        }
    }

    /**
     * Clean out spurious headers from an Element. Removes h1 and h2 descendants that have a negative class
     * weight or a link density above 0.33.
     *
     * @param e the element whose headers are examined
     **/
    private void cleanHeaders(Element e) {
        for (int headerIndex = 1; headerIndex < 3; headerIndex++) {
            Elements headers = e.getElementsByTag("h" + headerIndex);
            for (int i = headers.size() - 1; i >= 0; i--) {
                if (getClassWeight(headers.get(i)) < 0 || getLinkDensity(headers.get(i)) > 0.33) {
                    headers.get(i).remove();
                }
            }
        }
    }

    /**
     * Prepare the article node for display. Clean out any inline styles, iframes, forms, strip extraneous
     * {@code <p>} tags, etc. The element is modified in place.
     *
     * @param articleContent the article container to clean
     **/
    private void prepArticle(Element articleContent) {
        // we don't need to do this, we don't care
        cleanStyles(articleContent);
        // this replaces any break element or an nbsp with a plain break
        // element.
        // not needed. We will deal with breaks as we deal with breaks
        // killBreaks(articleContent);

        /* Clean out junk from the article content */
        cleanConditionally(articleContent, "form");
        clean(articleContent, "object");
        clean(articleContent, "h1");

        /**
         * If there is only one h2, they are probably using it as a header and not a subheader, so remove it
         * since we already have a header.
         ***/
        if (articleContent.getElementsByTag("h2").size() == 1) {
            clean(articleContent, "h2");
        }
        clean(articleContent, "iframe");

        cleanHeaders(articleContent);

        /*
         * Do these last as the previous stuff may have removed junk that will affect these
         */
        cleanConditionally(articleContent, "table");
        cleanConditionally(articleContent, "ul");
        // could have no children, will crash then
        if (articleContent.children().size() != 0) {
            cleanConditionally(articleContent.child(0), "div");
        }

        /* Remove extra paragraphs */
        Elements articleParagraphs = articleContent.getElementsByTag("p");
        for (Element para : articleParagraphs) {
            int imgCount = para.getElementsByTag("img").size();
            int embedCount = para.getElementsByTag("embed").size();
            int objectCount = para.getElementsByTag("object").size();

            if (imgCount == 0 && embedCount == 0 && objectCount == 0 && para.text().matches("\\s*")) {
                para.remove();
            }
        }

        // "br + p" is a CSS selector, not a tag name, so it has to go through select().
        Elements parasWithPreceedingBreaks = articleContent.select("br + p");
        for (Element pe : parasWithPreceedingBreaks) {
            Element brElement = pe.previousElementSibling();
            if (brElement != null) {
                brElement.remove();
            }
        }
    }

    /**
     * Clear off the style attributes of {@code articleContent} and all its descendants in case they
     * influence something else.
     */
    private void cleanStyles(Element articleContent) {
        for (Element e : articleContent.getAllElements()) {
            e.removeAttr("style");
        }
    }

    /**
     * Clean a node of all elements of type "tag".
     *
     * @param e the element to clean
     * @param tag the tag name of the elements to remove
     **/
    private void clean(Element e, String tag) {
        Elements targetList = e.getElementsByTag(tag);
        targetList.remove();
    }

    /**
     * Compute the share of {@code e}'s text that sits inside anchor elements.
     *
     * @return link text length divided by total text length; 0 when the element has no text at all
     */
    private double getLinkDensity(Element e) {
        Elements links = e.getElementsByTag("a");
        double textLength = e.text().length();
        if (textLength == 0) {
            // Avoid 0/0 = NaN, which silently poisons every later score comparison.
            return 0;
        }
        double linkLength = 0;
        for (Element link : links) {
            linkLength += link.text().length();
        }

        return linkLength / textLength;
    }

    /**
     * Derive the article title from the document's title element, trimming site-name prefixes/suffixes
     * separated by a bar, dash or colon, and falling back to a lone h1 for very long or very short titles.
     * If the trimmed result has four words or fewer the original title is returned instead.
     */
    private String getArticleTitle() {
        String curTitle = "";
        String origTitle = "";

        Elements titleElements = document.getElementsByTag("title");
        if (titleElements.size() > 0) {
            if (titleElements.size() > 1) {
                LOG.warn("More than one title.");
            }
            curTitle = titleElements.get(0).text();
            origTitle = curTitle;
        }

        if (Patterns.exists(Patterns.BAR_DASH, curTitle)) {
            curTitle = origTitle.replaceAll("(.*)[\\|\\-] .*", "$1");
            if (curTitle.split(" ").length < 3) {
                curTitle = origTitle.replaceAll("[^\\|\\-]*[\\|\\-](.*)", "$1");
            }
        } else if (curTitle.indexOf(": ") != -1) {
            curTitle = origTitle.replaceAll(".*:(.*)", "$1");

            if (curTitle.split(" ").length < 3) {
                curTitle = origTitle.replaceAll("[^:]*[:](.*)", "$1");
            }
        } else if (curTitle.length() > 150 || curTitle.length() < 15) {
            Elements hOnes = document.getElementsByTag("h1");
            if (hOnes.size() == 1) {
                curTitle = hOnes.get(0).text();
            }
        }

        curTitle = curTitle.trim();

        if (curTitle.split(" ").length <= 4) {
            curTitle = origTitle;
        }
        return curTitle;
    }

    /**
     * Compute the article base URL for {@code stringUrl}.
     *
     * @return the base URL as a string, or null if the URL cannot be parsed
     */
    private String findBaseUrl(String stringUrl) {
        try {
            URI base = findBaseUrl0(stringUrl);
            return base.toString();
        } catch (URISyntaxException e) {
            LOG.debug("Failed to get base URI", e);
            return null;
        }
    }

    /**
     * Strip the query string, file extensions and page-number-looking pieces from {@code stringUrl} to get
     * a URL that should be common to all pages of one article.
     *
     * @throws URISyntaxException if the URL (or the cleaned result) is not a valid URI
     */
    private URI findBaseUrl0(String stringUrl) throws URISyntaxException {
        // Compensate for Windows path names.
        stringUrl = stringUrl.replace("\\", "/");
        int qindex = stringUrl.indexOf("?");
        if (qindex != -1) {
            // stuff after the ? tends to make the Java URL parser burp.
            stringUrl = stringUrl.substring(0, qindex);
        }
        URI url = new URI(stringUrl);
        URI baseUrl = new URI(url.getScheme(), url.getAuthority(), url.getPath(), null, null);

        String path = baseUrl.getPath();
        if (path == null) {
            path = "";
        }
        if (path.startsWith("/")) {
            path = path.substring(1); // toss the leading /
        }
        String[] pieces = path.split("/");
        // NOTE(review): the javascript original walks the segments in reverse (last segment first);
        // here they are visited in path order, so "first or second segment" below means the start of
        // the path. Confirm whether that is intended.
        List<String> urlSlashes = new ArrayList<String>();
        for (String piece : pieces) {
            urlSlashes.add(piece);
        }
        List<String> cleanedSegments = new ArrayList<String>();
        for (int i = 0; i < urlSlashes.size(); i++) {
            String segment = urlSlashes.get(i);

            // Split off and save anything that looks like a file type.
            if (segment.indexOf(".") != -1) {
                String[] dotParts = segment.split("\\.");
                // A trailing dot ("foo.") yields a single part: there is no type to strip.
                if (dotParts.length > 1) {
                    String possibleType = dotParts[1];

                    /*
                     * If the type isn't alpha-only, it's probably not actually a file extension.
                     */
                    if (possibleType.matches("[a-zA-Z]*")) {
                        segment = dotParts[0];
                    }
                }
            }

            /**
             * EW-CMS specific segment replacement. Ugly. Example:
             * http://www.ew.com/ew/article/0,,20313460_20369436,00.html
             **/
            if (segment.indexOf(",00") != -1) {
                segment = segment.replaceFirst(",00", "");
            }

            // If our first or second segment has anything looking like a page
            // number, remove it.
            /* Javascript code has some /i's here, we might need to fiddle */
            Matcher pnMatcher = Patterns.PAGE_NUMBER_LIKE.matcher(segment);
            if (pnMatcher.matches() && ((i == 1) || (i == 0))) {
                segment = pnMatcher.replaceAll("");
            }

            boolean del = false;

            /*
             * If this is purely a number, and it's the first or second segment, it's probably a page number.
             * Remove it.
             */
            if (i < 2 && segment.matches("^\\d{1,2}$")) {
                del = true;
            }

            /* If this is the first segment and it's just "index", remove it. */
            if (i == 0 && "index".equalsIgnoreCase(segment)) {
                del = true;
            }

            /*
             * If our first or second segment is smaller than 3 characters, and the first segment was purely
             * alphas, remove it.
             */
            /* /i again */
            // NOTE(review): matches("[a-z]") is only true for a single lowercase letter, which agrees with
            // neither the comment above nor the javascript (/[a-z]/i used as a search). Left as is.
            if (i < 2 && segment.length() < 3 && !urlSlashes.get(0).matches("[a-z]")) {
                del = true;
            }

            /* If it's not marked for deletion, push it to cleanedSegments. */
            if (!del) {
                cleanedSegments.add(segment);
            }
        }

        String cleanedPath;
        if (cleanedSegments.isEmpty()) {
            // Root URL: nothing left of the path. Keep the result free of a trailing slash when there is
            // an authority to anchor it.
            cleanedPath = url.getAuthority() != null ? "" : "/";
        } else {
            StringBuilder joined = new StringBuilder();
            for (String s : cleanedSegments) {
                joined.append('/').append(s);
            }
            cleanedPath = joined.toString();
        }
        return new URI(url.getScheme(), url.getAuthority(), cleanedPath, null, null);
    }

    /*
     * Officially parsing URL's from HTML pages is a mug's game.
     */

    /**
     * Extract the host part of {@code url}: whatever sits between the first "//" and the next "/".
     *
     * @return the host, or an empty string when the URL has no "//"
     */
    private String getUrlHost(String url) {
        // httpx://host/.....

        int hostStart = url.indexOf("//");
        if (hostStart == -1) {
            return "";
        }
        int hostEnd = url.indexOf("/", hostStart + 2);
        if (hostEnd == -1) {
            return url.substring(hostStart + 2);
        } else {
            return url.substring(hostStart + 2, hostEnd);
        }

    }

    /**
     * Score every link under {@code body} on how much it looks like a "next page" link and return the best
     * one, provided it reaches a score of 50. The returned URL is also recorded in the set of parsed pages.
     *
     * @return the next page URL, or null when there is no convincing candidate or the base URL of the
     *         current page cannot be determined
     */
    private String findNextPageLink(Element body) {
        Map<String, PageLinkInfo> possiblePages = new HashMap<String, PageLinkInfo>();
        Elements allLinks = body.getElementsByTag("a");
        String articleBaseUrl = findBaseUrl(givenUrl);
        if (articleBaseUrl == null) {
            // Without a base URL none of the comparisons below make sense.
            return null;
        }
        String baseHost = getUrlHost(articleBaseUrl);

        /**
         * Loop through all links, looking for hints that they may be next-page links. Things like having
         * "page" in their textContent, className or id, or being a child of a node with a page-y className or
         * id. Also possible: levenshtein distance? longest common subsequence? After we do that, assign each
         * page a score, and
         **/
        for (Element link : allLinks) {
            String linkHref = link.attr("abs:href").replaceAll("#.*$", "").replaceAll("/$", "");

            /* If we've already seen this page, ignore it */
            if ("".equals(linkHref) || linkHref.equals(articleBaseUrl) || linkHref.equals(givenUrl)
                || parsedPages.contains(linkHref)) {
                continue;
            }

            String linkHost = getUrlHost(linkHref);

            /* If it's on a different domain, skip it. */
            if (!linkHost.equals(baseHost)) {
                continue;
            }

            String linkText = link.text(); // like innerText

            /* If the linkText looks like it's not the next page, skip it. */
            if (Patterns.EXTRANEOUS.matcher(linkText).matches() || linkText.length() > 25) {
                continue;
            }

            /*
             * If the leftovers of the URL after removing the base URL don't contain any digits, it's
             * certainly not a next page link.
             */
            // Remove the first literal occurrence; the base URL must not be interpreted as a regex.
            String linkHrefLeftover = linkHref;
            int baseAt = linkHref.indexOf(articleBaseUrl);
            if (baseAt >= 0) {
                linkHrefLeftover = linkHref.substring(0, baseAt)
                                   + linkHref.substring(baseAt + articleBaseUrl.length());
            }
            if (!Patterns.exists(Patterns.DIGIT, linkHrefLeftover)) {
                continue;
            }

            PageLinkInfo linkObj = possiblePages.get(linkHref);
            if (linkObj == null) {
                linkObj = new PageLinkInfo(0.0, linkText, linkHref);
                possiblePages.put(linkHref, linkObj);
            } else {
                String newLinkText = linkObj.getLinkText() + " | " + linkText;
                linkObj.setLinkText(newLinkText);
            }

            /**
             * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link,
             * but the odds are lower. Example:
             * http://www.actionscript.org/resources/articles/745/1/JavaScript
             * -and-VBScript-Injection-in-ActionScript-3/Page1.html
             **/
            if (linkHref.indexOf(articleBaseUrl) != 0) {
                linkObj.incrementScore(-25);
            }

            String linkData = linkText + " " + link.className() + " " + link.id();
            if (Patterns.exists(Patterns.NEXT_LINK, linkData)) {
                linkObj.incrementScore(50);
            }
            if (Patterns.exists(Patterns.PAGINATION, linkData)) {
                linkObj.incrementScore(25);
            }
            if (Patterns.exists(Patterns.FIRST_OR_LAST, linkData)) {
                // -65 is enough to negate any bonuses gotten from a > or » in
                // the text,
                /*
                 * If we already matched on "next", last is probably fine. If we didn't, then it's bad.
                 * Penalize.
                 */
                if (!Patterns.exists(Patterns.NEXT_LINK, linkObj.getLinkText())) {
                    linkObj.incrementScore(-65);
                }
            }
            if (Patterns.exists(Patterns.NEGATIVE, linkData)
                || Patterns.exists(Patterns.EXTRANEOUS, linkData)) {
                linkObj.incrementScore(-50);
            }
            if (Patterns.exists(Patterns.PREV_LINK, linkData)) {
                linkObj.incrementScore(-200);
            }

            /* If a parentNode contains page or paging or paginat */
            Element parentNode = link.parent();
            boolean positiveNodeMatch = false;
            boolean negativeNodeMatch = false;
            while (parentNode != null) {
                String parentNodeClassAndId = parentNode.className() + " " + parentNode.id();
                if (!positiveNodeMatch && Patterns.match(Patterns.PAGINATION, parentNodeClassAndId)) {
                    positiveNodeMatch = true;
                    linkObj.incrementScore(25);
                }
                if (!negativeNodeMatch && Patterns.match(Patterns.NEGATIVE, parentNodeClassAndId)) {
                    /*
                     * If this is just something like "footer", give it a negative. If it's something like
                     * "body-and-footer", leave it be.
                     */
                    if (!Patterns.exists(Patterns.POSITIVE, parentNodeClassAndId)) {
                        linkObj.incrementScore(-25);
                        negativeNodeMatch = true;
                    }
                }

                parentNode = parentNode.parent();
            }

            /**
             * If the URL looks like it has paging in it, add to the score. Things like /page/2/, /pagenum/2,
             * ?p=3, ?page=11, ?pagination=34
             **/
            if (Patterns.exists(Patterns.PAGE_AND_NUMBER, linkHref)
                || Patterns.exists(Patterns.PAGE_OR_PAGING, linkHref)) {
                linkObj.incrementScore(+25);
            }

            /* If the URL contains negative values, give a slight decrease. */
            if (Patterns.exists(Patterns.EXTRANEOUS, linkHref)) {
                linkObj.incrementScore(-15);
            }

            /**
             * Minor punishment to anything that doesn't match our current URL. NOTE: I'm finding this to
             * cause more harm than good where something is exactly 50 points. Dan, can you show me a
             * counterexample where this is necessary? if (linkHref.indexOf(window.location.href) !== 0) {
             * linkObj.score -= 1; }
             **/

            /**
             * If the link text can be parsed as a number, give it a minor bonus, with a slight bias towards
             * lower numbered pages. This is so that pages that might not have 'next' in their text can still
             * get scored, and sorted properly by score.
             **/
            boolean linkNumeric = false;
            int linkTextAsNumber = 0;

            try {
                linkTextAsNumber = Integer.parseInt(linkText);
                linkNumeric = true;
            } catch (NumberFormatException ignored) {
                // Not a number: the link simply gets no numeric bonus.
            }

            if (linkNumeric) {
                // Punish 1 since we're either already there, or it's probably
                // before what we want anyways.
                if (linkTextAsNumber == 1) {
                    linkObj.incrementScore(-10);
                } else {
                    // Todo: Describe this better
                    linkObj.incrementScore(Math.max(0, 10 - linkTextAsNumber));
                }
            }
        }

        /**
         * Loop through all of our possible pages from above and find our top candidate for the next page URL.
         * Require at least a score of 50, which is a relatively high confidence that this page is the next
         * link.
         **/
        PageLinkInfo topPage = null;
        for (Map.Entry<String, PageLinkInfo> pageEntry : possiblePages.entrySet()) {
            if (pageEntry.getValue().getScore() >= 50
                && (topPage == null || topPage.getScore() < pageEntry.getValue().getScore())) {
                topPage = pageEntry.getValue();
            }
        }

        if (topPage != null) {
            String nextHref = topPage.getHref().replaceFirst("/$", "");
            LOG.debug("Next page = " + nextHref);
            parsedPages.add(nextHref);
            return nextHref;
        } else {
            return null;
        }
    }

    /**
     * Get the number of times the character {@code s} appears in the text of node {@code e}.
     *
     * @param e the element whose text is examined
     * @param s the character to count
     * @return the number of occurrences
     **/
    int getCharCount(Element e, char s) {
        // Counted literally: String.split would treat the character as a regex and drop trailing matches.
        return countOccurrences(e.text(), s);
    }

    public void setPageReader(PageReader pageReader) {
        this.pageReader = pageReader;
    }

    public PageReader getPageReader() {
        return pageReader;
    }

    public boolean isImpossible() {
        return impossible;
    }

    public String getNextPageLink() {
        return nextPageLink;
    }

    public String getTitle() {
        return title;
    }

    public String getArticleText() {
        return articleText;
    }

    public void setReadAllPages(boolean readAllPages) {
        this.readAllPages = readAllPages;
    }

    public boolean isReadAllPages() {
        return readAllPages;
    }

    public List<String> getXmlImages() {
        return xmlImages;
    }

}